# Load source data ----
# Every CSV is read from an absolute path on the author's machine.
deepsolar_data <- read_csv("/Users/saptarshighose/Documents/Saptarshi/University of Chicago/Data Visualization/Data/deepsolar_tract.csv")
deepsolar_cost_panel_data <- read_csv("/Users/saptarshighose/Documents/Saptarshi/University of Chicago/Data Visualization/Data/deepsolar_cost_panel_data.csv")
openpv_data <- read_csv("/Users/saptarshighose/Documents/Saptarshi/University of Chicago/Data Visualization/Data/openpv_all.csv")
eia_total_coal_data <- read_csv("/Users/saptarshighose/Documents/Saptarshi/University of Chicago/Data Visualization/Data/eia_total_coal_electricity_output.csv")
epa_air_pollution_data <- read_csv("/Users/saptarshighose/Downloads/pollution_used.csv")

# A large scipen penalty makes R prefer fixed over scientific notation.
options(scipen = 10000)

# Reshape the coal data to long form keyed on Year, then give columns 2 and 3
# (presumably melt()'s `variable` and `value` columns -- TODO confirm) the
# display names used by the area chart.
long <- melt(eia_total_coal_data, id.vars = "Year")
colnames(long)[2:3] <- c(
  "Coal-based Energy Production by Sector",
  "Total Coal-based Energy Production (Gigawatts)"
)

# Stacked area chart of coal-based energy production by sector (2000-2017),
# with a vertical marker and label at 2011.
#
# Columns are referenced by (backticked) name inside aes() instead of
# `long$...`: `$` bypasses the data attached to the plot and ggplot2 warns
# against it. The 2011 label is drawn with annotate() so it is rendered once
# rather than once per row of `long`.
ggplot(
  long,
  aes(
    x = Year,
    y = `Total Coal-based Energy Production (Gigawatts)`,
    fill = `Coal-based Energy Production by Sector`
  )
) +
  geom_area(colour = "black", size = .2) +
  scale_fill_manual(
    values = c("#4ABF65", "#317F43", "#62FF86", "#194022", "#4FCE6D")
  ) +
  scale_x_continuous(
    expand = c(0, 0),
    limits = c(2000, 2017),
    breaks = seq(2000, 2017, 1),
    labels = seq(2000, 2017, 1)
  ) +
  geom_vline(xintercept = 2011, color = "black", linetype = "solid") +
  annotate(
    "text",
    x = 2011, y = 3000000,
    hjust = -.03, vjust = -10,
    label = "New EPA Coal Regulations (2011)",
    color = "black", fontface = 1
  ) +
  labs(
    title = "Coal-based Energy Has Fallen Sharply Over Time in the US",
    subtitle = "Total Coal-based Energy Production",
    caption = "Source: US Energy Information Administration",
    x = "Year",
    y = "Total Coal Based Power (Gigawatts)"
  ) +
  guides(fill = guide_legend(title = "Coal Energy Sector")) +
  theme(
    text = element_text(size = 12, family = "Arial"),
    panel.grid.minor = element_line(colour = "white", size = 0.2),
    panel.grid.major = element_line(colour = "white", size = 0.2),
    panel.background = element_rect(fill = "white", colour = "#194022"),
    axis.text.x = element_text(angle = 45),
    plot.title = element_text(size = 16, color = "black"),
    plot.subtitle = element_text(size = 14, face = "italic", color = "black"),
    plot.caption = element_text(size = 12, color = "black"),
    axis.title = element_text(size = 14, color = "black")
  )



In this visualization, we can see how total coal-based energy production across sectors has fluctuated over time in the United States. The Total Coal Based Power trend line shows an overall decrease of over 25% between 2000 and 2017. Further, we can see that the EPA’s implementation of new coal regulations under the Obama Administration in 2011 aligns with the downward trend in coal energy output in that time period. According to Reuters, coal-powered electric generation from utilities may be declining at a particularly high rate because existing systems are often old and expensive to replace. In addition, they note that gas-powered generation is now very often a cheaper and less polluting alternative (https://www.reuters.com/article/us-usa-coal-kemp/us-power-producers-coal-consumption-falls-to-35-year-low-kemp-idUSKCN1M61ZX).



# Draw a 20% sample (without replacement) of the Open PV data and keep only
# the six installation types shown in the chart.
# NOTE(review): no seed is set, so the sample (and the plot) differs from run
# to run.
sampled_open_pv_data <- sample_frac(openpv_data, size = 0.2, replace = FALSE)
sampled_open_pv_data <- filter(
  sampled_open_pv_data,
  install_type %in% c(
    "Agricultural", "Commercial", "Education",
    "Government", "Nonprofit", "Residential"
  )
)

# Jittered scatter of system size (kW) by installation type.
#
# Only geom_jitter() is used: the original added geom_point() as well, which
# drew every observation twice (once at its true x position, once jittered).
# The constant stroke is set on the layer instead of being mapped in aes().
# NOTE(review): ylim(0, 1500) removes systems larger than 1500 kW from the
# plot, so the visible maximum reflects this limit.
ggplot(
  sampled_open_pv_data,
  aes(x = install_type, y = size_kw, color = install_type)
) +
  geom_jitter(stroke = 1) +
  scale_colour_brewer(palette = "Greens") +
  labs(
    title = "Commercial and Government Photo Voltaic Systems Vary Most Widely in Size",
    subtitle = "Size and Cost of Photo Voltaic Systems by Type",
    caption = "Source: The Open Photo Voltaic Project (National Renewable Energy Lab)",
    color = "Type of Photo Voltaic System",
    x = "Type of Photo Voltaic System",
    y = "Total Size of Photo Voltaic System (Kilowatts)"
  ) +
  ylim(0, 1500) +
  theme(
    text = element_text(size = 12, family = "Arial"),
    axis.text.x = element_text(angle = 90),
    strip.text = element_text(face = "plain", size = rel(1)),
    panel.grid.minor = element_line(colour = "white", size = 0.2),
    panel.grid.major = element_line(colour = "white", size = 0.2),
    panel.background = element_rect(fill = "white", colour = "#194022"),
    plot.title = element_text(size = 16, color = "black"),
    plot.subtitle = element_text(size = 14, face = "italic", color = "black"),
    plot.caption = element_text(size = 12, color = "black"),
    axis.title = element_text(size = 14, color = "black")
  )



From this visualization, we can see that commercial, government, and non-profit applications have the widest variation in the total size of photo voltaic systems (Kilowatts). The data shows that these systems reach nearly 1500KW in size. In contrast, residential and agricultural applications tend to be much smaller – rarely exceeding 500KW in size.



# Bubble chart of PV system counts by state (states ordered by descending
# count); point size encodes the average cost of electricity.
#
# Changes from the original:
# * Columns are referenced by name inside aes()/reorder() instead of via
#   `deepsolar_cost_panel_data$...`, which ggplot2 warns against.
# * `alpha = 0.3` was passed to ggplot(), where it has no effect; it is now
#   set on geom_point() so the bubbles are actually translucent.
# * The CA/FL labels and the connecting segment use annotate() so each is
#   drawn once, not once per row of the data.
ggplot(
  deepsolar_cost_panel_data,
  aes(
    x = reorder(State, -`PV Systems`),
    y = `PV Systems`,
    size = `Average Cost of Electricity`
  )
) +
  geom_point(colour = "#4FCE6D", alpha = 0.3, stroke = 1) +
  annotate(
    "text",
    x = "CA", y = 693.250, hjust = -.1, vjust = 1,
    label = "California (693,250 PV Systems)",
    color = "dark green", size = 3
  ) +
  annotate(
    "text",
    x = "FL", y = 155.383, hjust = -.1, vjust = 1,
    label = "Florida (155,383 PV Systems)",
    color = "dark red", size = 3
  ) +
  annotate(
    "segment",
    x = "CA", y = 693.250, xend = "FL", yend = 155.383,
    size = 1, linejoin = "round", linetype = "solid"
  ) +
  labs(
    title = "California Easily Leads the Nation in Building Photo Voltaic Systems",
    subtitle = "Count and Average Cost of Photo Voltaic Systems by State",
    caption = "Source: Stanford Deepsolar Project",
    size = "Average Cost per KW-hour \n (cents)",
    x = "State",
    y = "Count of Photo Voltaic Systems (Thousands)"
  ) +
  theme(
    text = element_text(size = 12, family = "Arial"),
    axis.text.x = element_text(angle = 90),
    strip.text = element_text(face = "plain", size = rel(1)),
    panel.grid.minor = element_line(colour = "white", size = 0.2),
    panel.grid.major = element_line(colour = "white", size = 0.2),
    panel.background = element_rect(fill = "white", colour = "#194022"),
    plot.title = element_text(size = 16, color = "black"),
    plot.subtitle = element_text(size = 14, face = "italic", color = "black"),
    plot.caption = element_text(size = 12, color = "black"),
    axis.title = element_text(size = 14, color = "black")
  )



From this chart, showing the number of photo voltaic systems present in each state, we can see that California leads the nation by a wide margin, with more than four times as many systems as Florida, the next closest competitor (693,250 versus 155,383). Further, cost per unit of electricity in these states varies widely. Illustratively, California leads the nation in photo voltaic systems, but pays significantly more per unit of electricity than Florida.



# Upper-case the state codes so the facet strips read as e.g. "CA".
deepsolar_data[["state"]] <- toupper(deepsolar_data[["state"]])

# One 8-bin histogram of tract-level average household income per state,
# with dollar-formatted x labels and a log10 count axis.
ggplot(deepsolar_data, aes(x = average_household_income)) +
  geom_histogram(color = "#16401E", fill = "#4FCE6D", bins = 8) +
  facet_wrap(~state) +
  scale_x_continuous(labels = scales::dollar) +
  scale_y_log10() +
  labs(
    title = "Average Income Distribution Varies Widely by State",
    subtitle = "Average Household Income by State (Census Tract Level)",
    caption = "Source: The Deepsolar Project + The US Census",
    x = "Average Household Income",
    y = "Count of Census Tracts"
  ) +
  theme(
    text = element_text(size = 12, family = "Arial"),
    # The original set white minor grid lines and then blanked them in a
    # second theme() call; only the final (blank) setting is kept here.
    panel.grid.minor = element_blank(),
    panel.grid.major = element_line(colour = "white", size = 0.2),
    panel.background = element_rect(fill = "white", colour = "#194022"),
    axis.text.x = element_text(angle = 90),
    strip.text = element_text(face = "plain", size = rel(1)),
    plot.title = element_text(size = 16, color = "black"),
    plot.subtitle = element_text(size = 14, face = "italic", color = "black"),
    plot.caption = element_text(size = 12, color = "black"),
    axis.title = element_text(size = 14, color = "black")
  )



From this chart we can see that some of the states with the highest concentration of photo voltaic systems, like California and Florida, also seem to have more earners at the higher ends of the distribution compared to other US states. Similarly, states like South Dakota and West Virginia that are on the lower end of the average income distribution are also low in terms of their concentration of photo voltaic systems. This suggests a possibly meaningful correlation between the wealth of state residents and the concentration of photo voltaic systems in their respective states.

# Coerce the emissions column to integer.
# NOTE(review): as.integer() truncates any fractional part -- confirm the
# source values are whole numbers.
epa_air_pollution_data$`Total Pollution Emitted` <- as.integer(epa_air_pollution_data$`Total Pollution Emitted`)

# Line chart of total emissions per year, one line per pollutant, with black
# point markers on each observation.
#
# Columns are referenced by name inside aes() instead of via
# `epa_air_pollution_data$...`, which bypasses the plot's data and triggers a
# ggplot2 warning.
ggplot(
  epa_air_pollution_data,
  aes(x = Year, y = `Total Pollution Emitted`, color = Pollutant)
) +
  geom_line(size = 5) +
  scale_colour_brewer(palette = "Greens") +
  geom_point(size = 1.8, color = "black") +
  scale_x_continuous(
    expand = c(0, 0),
    limits = c(2000, 2017),
    breaks = seq(2000, 2017, 1),
    labels = seq(2000, 2017, 1)
  ) +
  labs(
    title = "Air Pollution Emissions have Dropped Substantially Over Time",
    subtitle = "Emissions of CO, NOx, and SO2 in the US (Millions of Tons)",
    caption = "Source: EPA",
    color = "Air Pollutant Type",
    x = "Year",
    y = "Total Emissions (Millions of Tons)"
  ) +
  theme(
    axis.text.x = element_text(angle = 90),
    strip.text = element_text(face = "plain", size = rel(1)),
    text = element_text(size = 12, family = "Arial"),
    panel.grid.minor = element_line(colour = "white", size = 0.2),
    panel.grid.major = element_line(colour = "white", size = 0.2),
    panel.background = element_rect(fill = "white", colour = "#194022"),
    plot.title = element_text(size = 16, color = "black"),
    plot.subtitle = element_text(size = 14, face = "italic", color = "black"),
    plot.caption = element_text(size = 12, color = "black"),
    axis.title = element_text(size = 14, color = "black")
  )



From this chart, we can see trends over time for three of the major air pollutants. Emissions of each of these pollutants (CO, NOx, SO2) have declined over time – both individually and cumulatively. CO emissions show the sharpest overall decline.



# Violin plot (white fill, untrimmed) with a narrow box plot overlaid,
# showing the distribution of total emissions for each pollutant.
#
# Changes from the original:
# * y is referenced by column name rather than `epa_air_pollution_data$...`.
# * The x axis shows the pollutant, so it is no longer labelled "Year".
# * The legend title is set through `fill` (the aesthetic actually mapped);
#   the original used `color`, which is not mapped, so the title was ignored.
ggplot(
  epa_air_pollution_data,
  aes(x = Pollutant, y = `Total Pollution Emitted`, fill = Pollutant)
) +
  geom_violin(trim = FALSE, fill = "white") +
  geom_boxplot(width = 0.1) +
  labs(
    title = "Carbon Monoxide Levels Vary Widely In the US",
    subtitle = "Spread and Density of Major Air Pollutants in the US (2000-2017)",
    caption = "Source: EPA",
    fill = "Air Pollutant Type",
    x = "Air Pollutant Type",
    y = "Total Emissions (Millions of Tons)"
  ) +
  scale_fill_manual(values = c("#42BF5B", "#2C7F3C", "#58FF79")) +
  theme(
    text = element_text(family = "Arial"),
    panel.grid.minor = element_line(colour = "white", size = 0.2),
    panel.grid.major = element_line(colour = "white", size = 0.2),
    panel.background = element_rect(fill = "white", colour = "#194022"),
    plot.title = element_text(size = 16, color = "black"),
    plot.subtitle = element_text(size = 14, face = "italic", color = "black"),
    plot.caption = element_text(size = 12, color = "black"),
    axis.title = element_text(size = 14, color = "black")
  )



From this chart, we can see that among the three major pollutants shown, Carbon Monoxide levels vary most widely over time. Nitrogen Oxide and Sulfur Dioxide emissions vary much less – and they have a similar range (in millions of tons). As indicated by the bar in the middle of each box plot, the median values for these two air pollutants are also similar.



# Coerce the emissions column to integer so bar labels print without
# decimals.
# NOTE(review): as.integer() truncates any fractional part -- confirm the
# source values are whole numbers.
epa_air_pollution_data$`Total Pollution Emitted` <- as.integer(epa_air_pollution_data$`Total Pollution Emitted`)

# Stacked bar chart of total emissions per year, filled by pollutant, with
# each segment labelled with its value.
#
# Changes from the original:
# * Labels are positioned with position_stack(vjust = 0.5) so each sits in
#   the middle of its own bar segment; previously they were placed at the raw
#   (unstacked) y value and did not line up with the stacked bars.
# * Columns are referenced by name inside aes() rather than with `$`.
# * The legend title is set through `fill`, the aesthetic actually mapped.
ggplot(
  epa_air_pollution_data,
  aes(x = Year, y = `Total Pollution Emitted`, fill = Pollutant)
) +
  geom_bar(stat = "identity") +
  geom_text(
    aes(label = `Total Pollution Emitted`, group = Pollutant),
    position = position_stack(vjust = 0.5),
    size = 4
  ) +
  labs(
    title = "Total Emissions of Major Pollutants Have Been Steadily Declining Over Time ",
    subtitle = "Total Emission of Major Air Pollutants in the US 2000-2017",
    caption = "Source: EPA",
    fill = "Air Pollutant Type",
    x = "Year",
    y = "Total Emissions (Millions of Tons)"
  ) +
  scale_fill_manual(values = c("#42BF5B", "#2C7F3C", "#58FF79")) +
  scale_x_continuous(
    expand = c(0, 0),
    limits = c(1999, 2018),
    breaks = seq(2000, 2017, 1),
    labels = seq(2000, 2017, 1)
  ) +
  theme(
    text = element_text(family = "Arial"),
    panel.grid.minor = element_line(colour = "white", size = 0.2),
    panel.grid.major = element_line(colour = "white", size = 0.2),
    panel.background = element_rect(fill = "white", colour = "#194022"),
    plot.title = element_text(size = 16, color = "black"),
    plot.subtitle = element_text(size = 14, face = "italic", color = "black"),
    plot.caption = element_text(size = 12, color = "black"),
    axis.title = element_text(size = 14, color = "black")
  )



From this chart we can see further evidence that total emissions – among these three major air pollutants in the US – have declined over time. However, Carbon Monoxide and Sulfur Dioxide have declined the most substantially. According to the EPA, evolving regulatory action has been taken over the years – most notably under the Clean Air Act – to target and reduce all three of these toxic air pollutants (https://www.epa.gov/air-trends/).